In [1]:
import numpy as np
import seaborn as sb
import pandas as pd
import MyML.cluster.K_Means3 as myKM
import MyML.cluster.eac as eac
import MyML.helper.partition as mpart
import MyML.metrics.accuracy as accuracy
import MyML.cluster.linkage as linkage
In [2]:
# List the contents of the ecg_it dataset folder (IPython shell shortcut).
ls ../datasets/ecg_it/
In [31]:
# Location of the ecg_it data file.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
foldername = "/home/chiroptera/QCThesis/datasets/" + "ecg_it/"
dataname = foldername + "ecg_it.data"
In [515]:
# Write the feature matrix and the label vector as comma-separated text files
# inside the dataset folder.
# NOTE(review): `data` and `gt` are only assigned by cells further down, so this
# cell fails on a fresh kernel executed top to bottom.
np.savetxt(foldername + "data.csv", data, delimiter=',')
np.savetxt(foldername + "ground_truth.csv", gt, delimiter=',')
In [32]:
# Read the comma-separated file without treating any row as a header,
# report its dimensions and preview the first rows.
dataset = pd.read_csv(dataname, header=None, sep=",")
print dataset.shape
dataset.head()
Out[32]:
In [24]:
# Drop the first row and the first column, then rebuild the frame from what is left.
# NOTE(review): `dataset` is rebound to the result, so the cell is not idempotent —
# every re-run strips one more row and one more column.
raw = dataset.values[1:,1:]
dataset = pd.DataFrame(raw)
dataset.head()
Out[24]:
In [29]:
# NOTE(review): writes the current frame to `dataname`, i.e. over the file the
# data was read from, using to_csv's default index/header output — confirm that
# replacing the raw file is intended.
dataset.to_csv(dataname)
In [664]:
# Distinct values found in column 64, followed by how many there are.
print dataset[64].unique()
print dataset[64].unique().size
In [421]:
# Replace every cell equal to 'y' with 1, in place.
dataset[dataset == 'y'] = 1
In [500]:
# Distinct values found in column 9.
dataset[9].unique()
Out[500]:
In [427]:
# Replace every cell equal to 'republican' with 1, in place.
dataset[dataset=='republican']=1
In [431]:
# Convert every column of the frame to int32.
dataset = dataset.astype(np.int32)
In [436]:
# Column labels as a generic object-dtype index.
# The builtin `object` is used because the `np.object` alias no longer exists in
# current NumPy releases; both name the same dtype.
new_cols = dataset.columns.astype(object)
In [416]:
# Drop every row that holds the missing-value marker '?' in any of columns 1..16.
for i in range(1, 17):
    dataset = dataset[dataset[i] != '?']
In [417]:
# Dimensions of the frame as (rows, columns).
dataset.shape
Out[417]:
In [400]:
# Dimensions of the frame as (rows, columns).
dataset.shape
Out[400]:
In [518]:
# Imported in this cell because the notebook's only other import of `load_iris`
# sits in a cell further down; without it this line raises NameError on a fresh
# top-to-bottom run.
from sklearn.datasets import load_iris

dataset = load_iris()
In [520]:
# Take the features (as float32) and the targets from the loaded dataset object.
data=dataset.data.astype(np.float32)
gt=dataset.target
In [667]:
# Features are every column except the last one.
# `.values` is used instead of `get_values()`, which pandas has removed; it is
# also the accessor the other loading cells of this notebook use.
data = dataset.values[:, :-1]
In [668]:
# Cast the feature matrix to float32 and display its dimensions.
data = data.astype(np.float32)
data.shape
Out[668]:
In [669]:
# Labels are the last column.
# `.values` is used instead of `get_values()`, which pandas has removed; it is
# also the accessor the other loading cells of this notebook use.
gt = dataset.values[:, -1]
In [670]:
# Cast the label vector to int32.
gt = gt.astype(np.int32)
In [506]:
# Remap the labels in `gt`, in place, to consecutive integers 0..k-1 following
# the sorted order of the original labels.
# The inverse index returned by np.unique is exactly that rank, computed for all
# samples at once. Relabelling one class at a time could merge two classes when
# a freshly assigned id coincided with a label still waiting to be processed
# (e.g. labels -3, -2, 0).
gt[:] = np.unique(gt, return_inverse=True)[1].reshape(gt.shape)
In [671]:
# Display the label vector.
gt
Out[671]:
In [566]:
# Ground truth for 600 samples stored as 6 consecutive classes of 100 samples:
# samples 0-99 are class 0, 100-199 are class 1, ..., 500-599 are class 5.
gt = np.repeat(np.arange(6, dtype=np.int32), 100)
In [77]:
# Iris from scikit-learn: features cast to float32, targets taken as provided.
from sklearn.datasets import load_iris
iris=load_iris()
data=iris.data.astype(np.float32)
gt=iris.target
In [824]:
# Ionosphere: every column but the last is a float32 feature; the last column
# holds the class as 'g' / 'b'.
# NOTE(review): absolute, user-specific path.
foldername = "/home/diogoaos/QCThesis/datasets/" + "ionosphere/"
dataname = foldername + "ionosphere.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
print dataset.shape
# Not the last expression of the cell, so this preview is not displayed.
dataset.head()
data = dataset.values[:,:-1].astype(np.float32)
gt = dataset.values[:,-1]
# Encode the classes numerically ('g' -> 1, 'b' -> 0) before the integer cast.
gt[gt=='g']=1
gt[gt=='b']=0
gt = gt.astype(np.int32)
In [681]:
# Optdigits (training file): every column but the last is a float32 feature,
# the last column is the int32 class label.
# `.values` replaces `get_values()`, which pandas has removed; it is the accessor
# the other loading cells of this notebook already use.
foldername = "/home/diogoaos/QCThesis/datasets/" + "optdigits/"
dataname = foldername + "optdigits.tra"
dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.values[:, :-1].astype(np.float32)
gt = dataset.values[:, -1].astype(np.int32)
In [701]:
# mfeat-fou: the file holds features only (space separated); labels come from
# the row position, 200 consecutive rows per class.
# `.values` replaces `get_values()`, which pandas has removed.
foldername = "/home/diogoaos/QCThesis/datasets/" + "mfeat/"
dataname = foldername + "mfeat-fou.asc"
dataset = pd.read_csv(dataname, header=None, sep=" ")
data = dataset.values.astype(np.float32)
# Row index // 200 yields 0 for rows 0-199, 1 for rows 200-399, and so on.
# Unlike filling an np.empty buffer for a fixed 10 classes, this never leaves
# uninitialised labels when the file does not have exactly 2000 rows.
gt = (np.arange(dataset.shape[0]) // 200).astype(np.int32)
In [149]:
# Isolet: every column but the last is a float32 feature, the last column is the
# int32 class label.
# `.values` replaces `get_values()`, which pandas has removed; it is the accessor
# the other loading cells of this notebook already use.
foldername = "/home/chiroptera/QCThesis/datasets/" + "isolet/"
dataname = foldername + "isolet1-5.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
data = dataset.values.astype(np.float32)[:, :-1]
gt = dataset.values[:, -1].astype(np.int32)
In [90]:
# ECG dataset: float32 features and int32 labels, keeping labelled samples only.
# NOTE(review): absolute, user-specific path.
foldername = "/home/chiroptera/QCThesis/datasets/" + "ecg_it/"
dataname = foldername + "ecg_it.data"
dataset = pd.read_csv(dataname, header=None, sep=",")
# Row 0 and column 0 are skipped; the last column is the label.
# presumably row 0 / column 0 are the header and index written by an earlier
# `to_csv` of this file — TODO confirm
data = dataset.values[1:,1:-1].astype(np.float32)
gt = dataset.values[1:,-1].astype(np.int32)
#remove unlabeled
# Label value 2 marks an unlabelled sample; features and labels are filtered
# with the same mask so they stay aligned.
labeled_idx = gt!=2
data = data[labeled_idx]
gt = gt[labeled_idx]
In [79]:
# Boolean masks selecting the samples labelled 0 and the samples labelled 1.
gt0 = gt==0
gt1 = gt==1
In [80]:
# Keep all class-0 samples followed by at most the first 600 class-1 samples;
# the same selection is applied to features and labels so they stay aligned.
# NOTE(review): `gt0`/`gt1` index the arrays they were computed from, so this
# cell cannot be re-run without recomputing the masks first.
data = np.concatenate((data[gt0],data[gt1][:600]))
gt = np.concatenate((gt[gt0],gt[gt1][:600]))
In [81]:
from sklearn.cluster import KMeans
In [143]:
# Re-import the K-Means module so edits to its source take effect in this kernel
# (`reload` is a builtin in Python 2).
reload(myKM)
Out[143]:
In [150]:
generator = myKM.K_Means()
generator._MAX_THREADS_BLOCK = 256
generator._PPT = 8
#generator = KMeans(init='random')
n_samples = data.shape[0]
true_n_clusters=np.unique(gt).size
sqrt_nsamples = np.sqrt(n_samples)
n_clusters = [sqrt_nsamples/2, sqrt_nsamples]
n_clusters = map(np.ceil, n_clusters)
n_clusters = map(int, n_clusters)
n_partitions = 100
rounds = 20
max_clusts=100
assoc_mode = "full"
prot_mode = "none"
print "number of samples: ", n_samples
print "interval of clusters: ", n_clusters
print "true number of clusters: ", true_n_clusters
print "number of partitions: ", n_partitions
print "number of rounds: ", rounds
print "maximum clusters assumption: ", max_clusts
In [151]:
# Timed construction of `ensemble` from the data, using the configured generator,
# cluster-count interval and number of partitions.
%time ensemble = mpart.generateEnsemble(data, generator, n_clusters=n_clusters, npartitions=n_partitions, iters=3)
In [146]:
# Inspect the generator's grid dimension, then the product
# gridDim * _MAX_THREADS_BLOCK * _PPT.
# presumably the number of points one launch can cover — TODO confirm
print generator._gridDim
print generator._gridDim * generator._MAX_THREADS_BLOCK * generator._PPT
sklearn 15.x mykm 2.66 1 ppt mykm
In [84]:
# Create an EAC estimator for `n_samples` samples (with mat_sparse=False) and
# time its fit on the ensemble with the configured association/prototype modes.
eacEst = eac.EAC(n_samples=n_samples, mat_sparse=False)
%time eacEst.fit(ensemble, assoc_mode=assoc_mode, prot_mode=prot_mode)
In [85]:
# Run the estimator's linkage step and display the first two columns of its result.
# presumably Z follows the SciPy linkage-matrix layout, where those columns are
# the indices of the merged clusters — TODO confirm
Z = eacEst._apply_linkage()
Z[:,:2]
Out[85]:
In [102]:
# Override the cluster count derived from the ground truth with 0.
# presumably 0 makes the lifetime clustering choose the number of clusters itself — TODO confirm
true_n_clusters=0
In [86]:
# Extract the final labels (timed), then score them against the ground truth
# with the Hungarian index (timed) and print the resulting accuracy.
%time labels = eacEst._lifetime_clustering(n_clusters=true_n_clusters)
accEst = accuracy.HungarianIndex(nsamples=n_samples)
%time accEst.score(gt, labels)
print accEst.accuracy
In [66]:
for l in np.unique(gt):
print l, (gt==l).sum()
In [87]:
for l in np.unique(labels):
print l, (labels==l).sum()
In [67]:
# Override the cluster count derived from the ground truth with 0.
# presumably 0 makes the lifetime clustering choose the number of clusters itself — TODO confirm
true_n_clusters = 0
In [68]:
thresholds = np.arange(0.05,1.01,0.05)
res = np.empty(((thresholds.size + 1) * rounds, 5))
# threshold, max_assocs, n_assocs, acc
n_cluster_ary = np.empty((thresholds.size + 1) * rounds, dtype=np.int32)
#progress bar
print ". " * rounds
i = 0
for r in range(rounds):
print ".",
ensemble = mpart.generateEnsemble(data, generator, n_clusters=n_clusters, npartitions=n_partitions, iters=3)
eacEst = eac.EAC(n_samples=n_samples, mat_sparse=False)
eacEst.fit(ensemble, assoc_mode=assoc_mode, prot_mode=prot_mode)
max_assocs, max_idx = eacEst.getMaxAssocs()
n_assocs = eacEst.getNNZAssocs()
labels = eacEst._lifetime_clustering(n_clusters=true_n_clusters)
accEst = accuracy.HungarianIndex(nsamples=n_samples)
# HungarianIndex takes a huge time for a high cluster imbalance
if np.unique(labels).size > max_clusts:
accEst.accuracy = -1
else:
accEst.score(gt, labels)
res[i, 0] = 0
res[i, 1] = max_assocs
res[i, 2] = n_assocs
res[i, 3] = accEst.accuracy
res[i, 4] = r
n_cluster_ary[i] = np.unique(labels).size
i += 1
for j in range(thresholds.size):
eacEst.apply_threshold(thresholds[j])
max_assocs, max_idx = eacEst.getMaxAssocs()
n_assocs = eacEst.getNNZAssocs()
labels = eacEst._lifetime_clustering(n_clusters=true_n_clusters)
accEst = accuracy.HungarianIndex(nsamples=n_samples)
if np.unique(labels).size > max_clusts:
accEst.accuracy = -1
else:
accEst.score(gt, labels)
max_assocs, max_idx = eacEst.getMaxAssocs()
nnz_pc = eacEst.getNNZAssocs()
res[i, 0] = thresholds[j]
res[i, 1] = max_assocs
res[i, 2] = nnz_pc
res[i, 3] = accEst.accuracy
res[i, 4] = r
n_cluster_ary[i] = np.unique(labels).size
i += 1
resPD = pd.DataFrame(data=res, columns=["threshold", "max assoc", "n assocs", "accuracy", "round"])
print "MAX ACCURACY=", resPD['accuracy'].max()
if true_n_clusters == 0:
resPD['n_clusts']=n_cluster_ary
resPD
Out[68]:
In [922]:
# Show which dataset file `dataname` currently points at.
print dataname
In [923]:
# Destination folder and base file name for the results table.
# NOTE(review): absolute, user-specific path, and `save_name` is hard-coded —
# make sure it matches the dataset that was actually loaded.
save_folder = "/home/diogoaos/QCThesis/experiments/threshold/"
save_name = "ionosphere_lifetime"
In [924]:
# Write the results table to <save_folder><save_name>.csv without the index column.
resPD.to_csv(path_or_buf = save_folder + save_name + ".csv", index=False)